# Prevent printing of warnings and such in the HTML
knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.align = "center")

library(tidyverse)

all.data <- read_csv("../../data_frames/all_data.csv")

head(all.data)

This calculation can be done on the biocyc website, but not in the high throughput manner that I require. I’ve created a manual implementation of it which yields the same numbers as their site.

Set up the needed files

The PPS score from biocyc is a way to score alterations to pathways given that the enzyme created by a gene has a known chemical reaction and that reaction is part of a known pathway. The hierarchy there is extremely important for this, it goes genes -> enzymes -> reactions -> pathways. These 4 files contain the information necessary to relate those 4 categories.

(og.ecb.to.frame <- read_tsv("../../biocyc_files/rel_ecb_to_frame.txt") %>%
  dplyr::rename("gene" = 1, "target_id" = 2))
(og.genes.of.enzymes <- read_tsv("../../biocyc_files/rel_enzymes_of_genes.txt") %>%
  dplyr::rename("enzyme" = 1, "gene" = 2))
(og.enzymes.of.reactions <- read_tsv("../../biocyc_files/rel_enzymes_of_reactions.txt") %>%
  dplyr::rename("reaction" = 1, "enzyme" = 2))
(og.reactions.of.pathways <- read_tsv("../../biocyc_files/rel_reactions_of_pathways.txt") %>%
  dplyr::rename("pathway" = 1, "reaction" = 2))

Additionally, a translation of the IDs to useful names

path.trans <- read_tsv("../../biocyc_files/pathway_to_common.txt") %>%
  rename(pathway = Pathways, "common_name" = "Common-Name")

Some of the cells have compounded information, i.e. one entity in the first col has multiple things in the second col. Here I manipulate the data to give 1:1 relationships between the two columns. Rows should increase in all cases.

gene.to.enzyme <- og.genes.of.enzymes %>%
  separate(gene, into = letters, sep = " // ") %>%
  pivot_longer(., cols = letters, names_to = "variable", values_to = "gene") %>%
  select(-variable) %>%
  unique() %>%
  filter(!is.na(gene))

enzyme.to.reaction <- og.enzymes.of.reactions %>%
  separate(enzyme, into = letters, sep = " // ") %>%
  pivot_longer(., cols = letters, names_to = "variable", values_to = "enzyme") %>%
  select(-variable) %>%
  unique() %>%
  filter(!is.na(enzyme))

reaction.to.pathway <- og.reactions.of.pathways %>%
  separate(reaction, into = paste0("v", 1:50), sep = " // ") %>%
  pivot_longer(., cols = paste0("v", 1:50), names_to = "variable", values_to = "reaction") %>%
  select(-variable) %>%
  unique() %>%
  filter(!is.na(reaction))

og.ecb.to.frame
gene.to.enzyme
enzyme.to.reaction
reaction.to.pathway

Check for NA in any of those because that would cause an issue

lapply(list(gene.to.enzyme, enzyme.to.reaction, reaction.to.pathway), function(x){
  col1 <- any(is.na(x[,1]))
  col2 <- any(is.na(x[,2]))
  
  return(paste("col1 have NA?", col1, "col2 have NA?", col2))
})
## [[1]]
## [1] "col1 have NA? FALSE col2 have NA? FALSE"
## 
## [[2]]
## [1] "col1 have NA? FALSE col2 have NA? FALSE"
## 
## [[3]]
## [1] "col1 have NA? FALSE col2 have NA? FALSE"

Lastly, a function that allows one to find ECB numbers that are part of a pathway.

# this takes a pathway ID, something like PWY0-1312, or a vector of IDs
find_ECB_from_path <- function(x){
  v1 <- reaction.to.pathway %>%
    filter(pathway %in% x) %>%
    pull(reaction)

  v2 <- enzyme.to.reaction %>%
    filter(reaction %in% v1) %>%
    pull(enzyme)

  v3 <- gene.to.enzyme %>%
    filter(enzyme %in% v2) %>%
    pull(gene)
  
  og.ecb.to.frame %>%
    filter(gene %in% v3)
}

Prepare input data for PPS calculation

Here, prepare the data that is needed for the PPS calculations. The function that calculates PPS requires a data frame that is target_id, line, and l2fc. Deletions are changed to -1.

real.fc.list <- all.data %>%
  filter(gene_type == "ECB") %>%
  select(line, target_id, ds_log2foldchange_rna, ds_log2foldchange_ribo, deleted) %>%
  pivot_longer(., cols = starts_with("ds_"), names_to = "seqtype", values_to = "l2fc") %>%
  mutate(seqtype = str_remove(seqtype, "ds_log2foldchange_"),
         l2fc = ifelse(deleted == TRUE, -3, l2fc)) %>%
  filter(!is.na(l2fc)) %>%
  split(.$seqtype) %>%
  map(function(x){select(x, -seqtype)})

real.fc.list
## $ribo
## # A tibble: 45,183 x 4
##    line  target_id deleted    l2fc
##    <chr> <chr>     <lgl>     <dbl>
##  1 Ara-1 ECB_00001 FALSE   -0.885 
##  2 Ara-1 ECB_00002 FALSE   -0.989 
##  3 Ara-1 ECB_00003 FALSE   -0.860 
##  4 Ara-1 ECB_00004 FALSE   -0.942 
##  5 Ara-1 ECB_00005 FALSE   -0.274 
##  6 Ara-1 ECB_00006 FALSE    0.0867
##  7 Ara-1 ECB_00007 FALSE    0.261 
##  8 Ara-1 ECB_00008 FALSE   -0.578 
##  9 Ara-1 ECB_00009 FALSE   -0.125 
## 10 Ara-1 ECB_00010 FALSE    1.36  
## # … with 45,173 more rows
## 
## $rna
## # A tibble: 45,313 x 4
##    line  target_id deleted    l2fc
##    <chr> <chr>     <lgl>     <dbl>
##  1 Ara-1 ECB_00001 FALSE   -0.368 
##  2 Ara-1 ECB_00002 FALSE   -0.841 
##  3 Ara-1 ECB_00003 FALSE   -1.06  
##  4 Ara-1 ECB_00004 FALSE   -1.08  
##  5 Ara-1 ECB_00005 FALSE   -0.946 
##  6 Ara-1 ECB_00006 FALSE   -0.0100
##  7 Ara-1 ECB_00007 FALSE    0.0598
##  8 Ara-1 ECB_00008 FALSE   -0.385 
##  9 Ara-1 ECB_00009 FALSE    0.0203
## 10 Ara-1 ECB_00010 FALSE    0.831 
## # … with 45,303 more rows

We also need a set of randomized fold changes to create null distributions. This chunk creates 100 instances of randomized fold changes for each seqtype and line.

# do this 100 times
rand.dfs <- lapply(1:100, function(z){
  # take each of the seqtype dfs
  lapply(real.fc.list, function(x){
    # and split the df by line
    x %>%
    split(.$line) %>%
    map(function(y){
      # reorder the l2fcs
      y$l2fc <- sample(y$l2fc, size = length(y$l2fc), replace = FALSE)
      
      # add an iteration column
      y$iteration <- z
      
      return(y)
    }) %>%
    bind_rows()
  }) %>%
  bind_rows(.id = "seqtype")
}) %>%
  bind_rows()

rand.df.list <- rand.dfs %>%
  split(list(.$iteration, .$seqtype)) %>%
  map(.f = select, -iteration, -seqtype)

rand.df.list[[1]]

Calculate PPS

PPS calculator function, this takes 1 argument, a data frame with columns line, target_id, l2fc.

pps_calc <- function(df, type){
  # relate fcs to genes
  fcs.g <- left_join(df, og.ecb.to.frame, by = "target_id") %>%
    select(-target_id) %>%
    filter(is.na(gene) == FALSE)
  
  # relate the above to an enzyme
  fcs.g.e <- left_join(fcs.g, gene.to.enzyme, by = "gene") %>%
    filter(is.na(enzyme) == FALSE)
  
  # relate the above to a reaction
  fcs.g.e.r <- left_join(fcs.g.e, enzyme.to.reaction, by = "enzyme") %>%
    filter(is.na(reaction) == FALSE)
  
  # calculate the RPS, i.e. max(abs(l2fc))
  rps <- fcs.g.e.r %>%
    group_by(line, reaction) %>%
    summarise(rps = max(abs(l2fc), na.rm = TRUE),
              rps2 = rps^2) %>%
    ungroup() %>%
    select(-rps)
  
  # calculate PPS, i.e. sqrt((sum(rps2)/n rps))
  pps <- left_join(rps, reaction.to.pathway, by = "reaction") %>%
    filter(is.na(pathway) == FALSE) %>%
    group_by(line, pathway) %>%
    summarise(pps = sqrt((sum(rps2, na.rm = TRUE)/n()))) %>%
    ungroup()

  return(pps)
}

Run the calculator on the actual data and randomized data

(real.pps <- lapply(real.fc.list, pps_calc) %>%
  bind_rows(.id = "seqtype") %>%
  mutate(type = "actual") %>%
  arrange(desc(pps)))
(rand.pps <- lapply(rand.df.list, pps_calc) %>%
  bind_rows(.id = "sample") %>% 
  separate(sample, into = c("iteration", "seqtype")) %>%
  filter(!is.infinite(pps)) %>%
  mutate(type = "random"))

Join those two dfs rowwise, the NAs in iteration column for the real data have to be replaced with something otherwise if you read it in and these rows are first, it will have a parsing failure upon reading in because it’s a ton of NAs in a column.

(pps.df <- bind_rows(real.pps, rand.pps) %>%
  left_join(., path.trans, by = "pathway") %>%
   replace_na(replace = list(iteration = "real")))

Save it.

write_csv(pps.df, "../../data_frames/table_s12_pps_scores.csv")
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] forcats_0.5.0   stringr_1.4.0   dplyr_1.0.1     purrr_0.3.4    
## [5] readr_1.3.1     tidyr_1.1.0     tibble_3.0.3    ggplot2_3.3.2  
## [9] tidyverse_1.3.0
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.0 xfun_0.16        haven_2.3.1      lattice_0.20-41 
##  [5] colorspace_1.4-1 vctrs_0.3.2      generics_0.0.2   htmltools_0.5.0 
##  [9] yaml_2.2.1       utf8_1.1.4       blob_1.2.1       rlang_0.4.7     
## [13] pillar_1.4.6     glue_1.4.1       withr_2.2.0      DBI_1.1.0       
## [17] dbplyr_1.4.4     modelr_0.1.8     readxl_1.3.1     lifecycle_0.2.0 
## [21] munsell_0.5.0    gtable_0.3.0     cellranger_1.1.0 rvest_0.3.5     
## [25] evaluate_0.14    knitr_1.29       fansi_0.4.1      broom_0.5.6     
## [29] Rcpp_1.0.5       scales_1.1.1     backports_1.1.8  jsonlite_1.7.0  
## [33] fs_1.4.2         hms_0.5.3        digest_0.6.25    stringi_1.4.6   
## [37] grid_4.0.3       cli_2.0.2        tools_4.0.3      magrittr_1.5    
## [41] crayon_1.3.4     pkgconfig_2.0.3  ellipsis_0.3.1   xml2_1.3.2      
## [45] reprex_0.3.0     lubridate_1.7.9  assertthat_0.2.1 rmarkdown_2.3   
## [49] httr_1.4.1       rstudioapi_0.11  R6_2.4.1         nlme_3.1-149    
## [53] compiler_4.0.3